{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7 调整树的参数： colsample_bytree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from xgboost import XGBClassifier\n",
    "import xgboost as xgb\n",
    "\n",
    "import pandas as pd \n",
    "import numpy as np\n",
    "\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "from sklearn.metrics import log_loss\n",
    "\n",
    "from matplotlib import pyplot\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the training CSV from the local data directory.\n",
    "# The DMatrix binaries and the test CSV were loaded here before but never used in\n",
    "# this notebook, so they are no longer read (fewer required files, less memory).\n",
    "dpath = './data/'\n",
    "train = pd.read_csv(dpath + 'RentListingInquries_FE_train.csv')\n",
    "\n",
    "# Separate the target column from the features; all remaining columns are kept.\n",
    "y_train = train['interest_level']\n",
    "train = train.drop([\"interest_level\"], axis=1)\n",
    "X_train = np.array(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Cross-validation splitter shared by the grid searches below:\n",
    "# 5 stratified folds, shuffled with a fixed random_state.\n",
    "kfold = StratifiedKFold(\n",
    "    n_splits=5,\n",
    "    shuffle=True,\n",
    "    random_state=3,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "沿用第二轮参数调整得到的 n_estimators 最优值（234）；其余参数固定为下方代码中的取值（并非全部是默认值），本轮只对 colsample_bytree 做网格搜索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'colsample_bytree': [0.6, 0.7, 0.8]}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#subsample = [i/10.0 for i in range(6,10)]\n",
    "colsample_bytree = [i/10.0 for i in range(6,9)]\n",
    "#param_6 = dict(subsample=subsample, colsample_bytree=colsample_bytree)\n",
    "param_7 = dict(colsample_bytree=colsample_bytree)\n",
    "param_7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Anaconda2\\envs\\python3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:667: DeprecationWarning: The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "([mean: -0.58010, std: 0.00266, params: {'colsample_bytree': 0.6},\n",
       "  mean: -0.58055, std: 0.00153, params: {'colsample_bytree': 0.7},\n",
       "  mean: -0.57940, std: 0.00198, params: {'colsample_bytree': 0.8}],\n",
       " {'colsample_bytree': 0.8},\n",
       " -0.57940497323533569)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Base classifier: all hyper-parameters are fixed here; GridSearchCV replaces\n",
    "# colsample_bytree with each candidate from param_7.\n",
    "xgb7 = XGBClassifier(\n",
    "        learning_rate =0.1,\n",
    "        n_estimators=234,  # best n_estimators found in the second tuning round\n",
    "        max_depth=6,\n",
    "        min_child_weight=4,\n",
    "        gamma=0,\n",
    "        subsample=0.8,\n",
    "        colsample_bytree=0.8,\n",
    "        colsample_bylevel = 0.7,\n",
    "        objective= 'multi:softprob',\n",
    "        seed=3)\n",
    "\n",
    "# Cross-validated search over param_7 with the kfold splitter, scored by negative\n",
    "# log loss (values closer to 0 are better).\n",
    "gsearch7 = GridSearchCV(xgb7, param_grid = param_7, scoring='neg_log_loss',n_jobs=-1, cv=kfold)\n",
    "gsearch7.fit(X_train , y_train)\n",
    "\n",
    "# grid_scores_ was deprecated in scikit-learn 0.18 and is gone from 0.20 on;\n",
    "# build the same (mean, std, params) triples per candidate from cv_results_.\n",
    "cv7 = gsearch7.cv_results_\n",
    "cv7_summary = list(zip(cv7['mean_test_score'], cv7['std_test_score'], cv7['params']))\n",
    "\n",
    "cv7_summary, gsearch7.best_params_, gsearch7.best_score_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "最优值在边界，考虑用0.9和1试试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'colsample_bytree': [0.9, 1.0]}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#subsample = [i/10.0 for i in range(6,10)]\n",
    "colsample_bytree = [i/10.0 for i in range(9,11)]\n",
    "#param_6 = dict(subsample=subsample, colsample_bytree=colsample_bytree)\n",
    "param_7_1 = dict(colsample_bytree=colsample_bytree)\n",
    "param_7_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Anaconda2\\envs\\python3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:667: DeprecationWarning: The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "([mean: -0.58040, std: 0.00222, params: {'colsample_bytree': 0.9},\n",
       "  mean: -0.58071, std: 0.00258, params: {'colsample_bytree': 1.0}],\n",
       " {'colsample_bytree': 0.9},\n",
       " -0.5803971347424125)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Base classifier with the same fixed hyper-parameters as the previous search;\n",
    "# GridSearchCV replaces colsample_bytree with each candidate from param_7_1.\n",
    "xgb7_1 = XGBClassifier(\n",
    "        learning_rate =0.1,\n",
    "        n_estimators=234,  # best n_estimators found in the second tuning round\n",
    "        max_depth=6,\n",
    "        min_child_weight=4,\n",
    "        gamma=0,\n",
    "        subsample=0.8,\n",
    "        colsample_bytree=0.8,\n",
    "        colsample_bylevel = 0.7,\n",
    "        objective= 'multi:softprob',\n",
    "        seed=3)\n",
    "\n",
    "# Cross-validated search over param_7_1 with the kfold splitter, scored by negative\n",
    "# log loss (values closer to 0 are better).\n",
    "gsearch7_1 = GridSearchCV(xgb7_1, param_grid = param_7_1, scoring='neg_log_loss',n_jobs=-1, cv=kfold)\n",
    "gsearch7_1.fit(X_train , y_train)\n",
    "\n",
    "# grid_scores_ was deprecated in scikit-learn 0.18 and is gone from 0.20 on;\n",
    "# build the same (mean, std, params) triples per candidate from cv_results_.\n",
    "cv7_1 = gsearch7_1.cv_results_\n",
    "cv7_1_summary = list(zip(cv7_1['mean_test_score'], cv7_1['std_test_score'], cv7_1['params']))\n",
    "\n",
    "cv7_1_summary, gsearch7_1.best_params_, gsearch7_1.best_score_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "0.9（-0.58040）和 1.0（-0.58071）的 neg_log_loss 都不如 0.8（-0.57940），所以 colsample_bytree 仍取 0.8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
